This project is trying to investigate if investment in different industry segments would impact on the amount of CO2 emission. Varibales being choosed are CO2 emissions total, CO2 emissions per capita, CO2 emissions per GDP, investment in energy, investment in telecommunication, investment in transportation, investment in water.
data<-read_xls("climate_change_download_0.xls",sheet = 1,col_types = "guess",na = "..")
summary(data)
## Country code Country name Series code
## Length:13512 Length:13512 Length:13512
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Series name SCALE Decimals 1990
## Length:13512 Min. :0 Min. :0.000 Min. :-3.000e+03
## Class :character 1st Qu.:0 1st Qu.:0.000 1st Qu.: 4.000e+00
## Mode :character Median :0 Median :1.000 Median : 8.100e+01
## Mean :0 Mean :0.581 Mean : 1.698e+10
## 3rd Qu.:0 3rd Qu.:1.000 3rd Qu.: 2.704e+03
## Max. :0 Max. :1.000 Max. : 2.190e+13
## NA's :3495 NA's :3495 NA's :8658
## 1991 1992 1993
## Min. :-8.190e+02 Min. :-8.040e+02 Min. :-7.590e+02
## 1st Qu.: 4.000e+00 1st Qu.: 4.000e+00 1st Qu.: 4.000e+00
## Median : 1.000e+02 Median : 1.010e+02 Median : 1.000e+02
## Mean : 2.454e+10 Mean : 2.503e+10 Mean : 2.493e+10
## 3rd Qu.: 4.131e+03 3rd Qu.: 4.220e+03 3rd Qu.: 4.243e+03
## Max. : 2.297e+13 Max. : 2.452e+13 Max. : 2.488e+13
## NA's :10015 NA's :9859 NA's :9795
## 1994 1995 1996
## Min. :-8.500e+02 Min. :-7.950e+02 Min. :-6.800e+02
## 1st Qu.: 4.000e+00 1st Qu.: 7.000e+00 1st Qu.: 4.000e+00
## Median : 1.020e+02 Median : 1.000e+02 Median : 1.070e+02
## Mean : 2.635e+10 Mean : 2.379e+10 Mean : 3.009e+10
## 3rd Qu.: 5.409e+03 3rd Qu.: 5.863e+03 3rd Qu.: 8.394e+03
## Max. : 2.672e+13 Max. : 2.967e+13 Max. : 3.028e+13
## NA's :9733 NA's :8840 NA's :9708
## 1997 1998 1999
## Min. :-7.280e+02 Min. :-6.020e+02 Min. :-5.040e+02
## 1st Qu.: 4.000e+00 1st Qu.: 5.000e+00 1st Qu.: 6.000e+00
## Median : 1.260e+02 Median : 1.160e+02 Median : 1.020e+02
## Mean : 3.046e+10 Mean : 2.983e+10 Mean : 2.915e+10
## 3rd Qu.: 1.559e+04 3rd Qu.: 1.535e+04 3rd Qu.: 8.830e+03
## Max. : 3.019e+13 Max. : 3.008e+13 Max. : 3.121e+13
## NA's :9745 NA's :9694 NA's :9507
## 2000 2001 2002
## Min. :-5.400e+02 Min. :-6.360e+02 Min. :-8.310e+02
## 1st Qu.: 5.000e+00 1st Qu.: 5.000e+00 1st Qu.: 5.000e+00
## Median : 9.300e+01 Median : 1.030e+02 Median : 1.020e+02
## Mean : 2.196e+10 Mean : 2.995e+10 Mean : 3.078e+10
## 3rd Qu.: 3.602e+03 3rd Qu.: 1.053e+04 3rd Qu.: 7.915e+03
## Max. : 3.221e+13 Max. : 3.201e+13 Max. : 3.327e+13
## NA's :8016 NA's :9494 NA's :9455
## 2003 2004 2005
## Min. :-9.800e+02 Min. :-1.034e+03 Min. :-1.028e+03
## 1st Qu.: 5.000e+00 1st Qu.: 5.000e+00 1st Qu.: 8.000e+00
## Median : 1.020e+02 Median : 1.000e+02 Median : 1.000e+02
## Mean : 3.501e+10 Mean : 3.812e+10 Mean : 3.471e+10
## 3rd Qu.: 8.840e+03 3rd Qu.: 6.497e+03 3rd Qu.: 6.348e+03
## Max. : 3.744e+13 Max. : 4.219e+13 Max. : 4.562e+13
## NA's :9469 NA's :9287 NA's :8428
## 2006 2007 2008
## Min. :-1.015e+03 Min. :-1.013e+03 Min. :-1.007e+03
## 1st Qu.: 5.000e+00 1st Qu.: 5.000e+00 1st Qu.: 6.000e+00
## Median : 1.000e+02 Median : 1.010e+02 Median : 9.900e+01
## Mean : 4.585e+10 Mean : 5.255e+10 Mean : 5.415e+10
## 3rd Qu.: 7.716e+03 3rd Qu.: 9.454e+03 3rd Qu.: 5.548e+03
## Max. : 4.945e+13 Max. : 5.580e+13 Max. : 6.126e+13
## NA's :9276 NA's :9264 NA's :8909
## 2009 2010 2011
## Min. :-9.900e+02 Min. :-3.200e+01 Mode:logical
## 1st Qu.: 3.000e+00 1st Qu.: 3.000e+00 TRUE:466
## Median : 9.200e+01 Median : 1.030e+02 NA's:13046
## Mean : 6.287e+10 Mean : 1.104e+11
## 3rd Qu.: 6.840e+03 3rd Qu.: 4.902e+06
## Max. : 5.808e+13 Max. : 6.304e+13
## NA's :9751 NA's :11180
mydata<-dplyr::filter(data,`Series code` =="IE.PPI.ENGY.CD" |`Series code` == "IE.PPI.TELE.CD"|`Series code` =="IE.PPI.TRAN.CD" |`Series code` =="IE.PPI.WATR.CD"|`Series code` =="EN.ATM.CO2E.PC"|`Series code` =="EN.ATM.CO2E.PP.GD.KD"|`Series code` =="EN.ATM.CO2E.KT")
names(mydata)[1]<-"countrycode"
names(mydata)[2]<-"countryname"
names(mydata)[3]<-"seriescode"
names(mydata)[4]<-"seriesname"
mydata<-mydata %>% as.data.frame() %>% melt(measure.var=7:28)
names(mydata)[7]<-"year"
summary(mydata)
## countrycode countryname seriescode
## Length:35882 Length:35882 Length:35882
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## seriesname SCALE Decimals year
## Length:35882 Min. :0 Min. :0.0000 1990 : 1631
## Class :character 1st Qu.:0 1st Qu.:0.0000 1991 : 1631
## Mode :character Median :0 Median :0.0000 1992 : 1631
## Mean :0 Mean :0.4286 1993 : 1631
## 3rd Qu.:0 3rd Qu.:1.0000 1994 : 1631
## Max. :0 Max. :1.0000 1995 : 1631
## (Other):26096
## value
## Min. :-3.000e+03
## 1st Qu.: 8.000e+00
## Median : 4.630e+02
## Mean : 4.726e+08
## 3rd Qu.: 2.167e+05
## Max. : 7.715e+10
## NA's :20700
names(mydata)
## [1] "countrycode" "countryname" "seriescode" "seriesname" "SCALE"
## [6] "Decimals" "year" "value"
spreadtest<-tidyr::unite(mydata,"series",c("seriesname","seriescode"),sep="",remove=TRUE,na.rm=FALSE)
spreadtest$SCALE=NULL
spreadtest$Decimals=NULL
spreadtest<-tidyr::spread(spreadtest,series,value)
names(spreadtest)[4]<-"CO2percapita"
names(spreadtest)[5]<-"CO2perGDP"
names(spreadtest)[6]<-"CO2total"
names(spreadtest)[7]<-"investenergy"
names(spreadtest)[8]<-"investtelecom"
names(spreadtest)[9]<-"investtransport"
names(spreadtest)[10]<-"investwater"
selected<-filter(spreadtest,countrycode=="ARG"|countrycode=="BRA"|countrycode=="ARG"|countrycode=="CHL"|countrycode=="CHN"|countrycode=="COL"|countrycode=="IDN"|countrycode=="IND"|
countrycode=="MEX"|countrycode=="MYS"|countrycode=="PER"|countrycode=="PHL"|countrycode=="RUS"|countrycode=="THA")
table1<-xtabs(~CO2total+countryname,data=spreadtest)
table1<-aggregate(CO2total~countryname,rm.na=TRUE,data=spreadtest,FUN = sum)
table2<-xtabs(~CO2percapita+countryname,data=spreadtest)
table2<-aggregate(CO2percapita~countryname,rm.na=TRUE,data=spreadtest,FUN=sum)
table3<-xtabs(~CO2perGDP+countryname,data=spreadtest)
table3<-aggregate(CO2perGDP~countryname,rm.na=TRUE,data=spreadtest,FUN = sum)
#total CO2 emission between year 1990 and year 2011 for all countries
ggplot(spreadtest,aes(x=countryname,y=CO2total,color=year))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#CO2 percapita between year 1990 and year 2011 for all countries
ggplot(spreadtest,aes(x=countryname,y=CO2percapita,color=year))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#CO2 perGDP between year 1990 and year 2011 for all countries
ggplot(selected,aes(x=countryname,y=CO2perGDP,color=year))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#the overall trend of total CO2 emission for all coutries considered as a population
ggplot(spreadtest,aes(x=year,y=CO2total))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#the overall trend of CO2 emission per capita for all coutries considered as a population
ggplot(spreadtest,aes(x=year,y=CO2percapita))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#the overall trend of CO2 emission per GDP for selected countries that have continious investments on different industries
ggplot(selected,aes(x=as.numeric(as.character(year)),y=CO2perGDP,color=countryname))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#the CO2 emission per GDP between year 1990 and year 2011 for selected countries
plot_ly(selected,x=~as.numeric(as.character(year)),y=~CO2perGDP,type='scatter',mode='lines')
#the CO2 total emission between year 1990 and year 2011 for selected countries
plot_ly(selected,x=~as.numeric(as.character(year)),y=~CO2total,type='scatter',mode='lines')
#the CO2 emission per capita between year 1990 and year 2011 for selected countries
plot_ly(selected,x=~as.numeric(as.character(year)),y=~CO2percapita,type='scatter',mode='lines')
3.2. Top emission from country (CO2 emission per GDP)
clean<-na.omit(spreadtest)
topemission<-arrange(table3,CO2perGDP)
ggplot(head(topemission,n=10),aes(countryname,CO2perGDP,fill=countryname,label=paste(CO2perGDP)))+geom_bar(sta='identity',color='black')+
coord_flip()+theme_bw()+theme(legend.position = 'none')+geom_text(aes(y=1),hjust=0,vjust=0.5,size=4)+
labs(title='Top countries with highest CO2 emission per GDP')
3.3 Comparing Investment in Different Segments with CO2 Emission for Each Selected Country
ARG<-filter(selected,countrycode=="ARG")
plot_ly(ARG,x=~ARG$year,y=~ARG$CO2percapita,name="CO2percapita",type='scatter',mode='lines') %>%
add_trace(y=~scale(ARG$investenergy),name='energy',mode="lines+markers") %>%
add_trace(y=~scale(ARG$investtelecom),name='telecom',mode='lines+markers') %>%
add_trace(y=~scale(ARG$investtransport),name='transport',mode='lines+markers')
3.4 Use the 13 Countries as a Sample to Test if Investment in Different Segments would Impact the Amount of CO2 Emission
sample<-na.omit(selected[3:10])
trace_0<-aggregate(sample$CO2percapita,by=list(year=sample$year),FUN=sum)
trace_1<-aggregate(sample$investenergy,by=list(year=sample$year),FUN=sum)
trace_2<-aggregate(sample$investtelecom,by=list(year=sample$year),FUN=sum)
trace_3<-aggregate(sample$investtransport,by=list(year=sample$year),FUN=sum)
plot_ly(sample,x=~trace_0$year,y=~trace_0$x,name="CO2percapita",type='scatter',mode='lines') %>%
add_trace(y=~scale(trace_1$x),name='energy',mode='lines+markers') %>%
add_trace(y=~scale(trace_2$x),name='telecom',mode='lines+markers') %>%
add_trace(y=~scale(trace_3$x),name='transport',mode='lines+markers')
cor.test(sample$CO2percapita,sample$investenergy)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 0.52495, df = 109, p-value = 0.6007
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1374618 0.2344165
## sample estimates:
## cor
## 0.05021779
cor.test(sample$CO2percapita,sample$investtelecom)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -0.39648, df = 109, p-value = 0.6925
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2227658 0.1495018
## sample estimates:
## cor
## -0.03794859
cor.test(sample$CO2percapita,sample$investtransport)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -0.85622, df = 109, p-value = 0.3938
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2641061 0.1062751
## sample estimates:
## cor
## -0.08173693
Based on both the EDA and the correlation test, investment in difference segments does not have significant impact on CO2 emission. Limitations would be: 1. The 13 countries, as a sample, could be not representative regarding the whole world. 2. Simple linear coefficient model shoule be validated using residual check.